#delimit;
clear;
set more off;
capture log close;

capture erase /data/doed_loan_raw.dta;

!st data/doed_loan_raw.sas7bdat /data/doed_loan_raw.dta

/*********************
This programs creates a dataset with the needed information from NSLDS loans
to create the degree variables and the dataset to calculate how much
money was disbursed in federal loans by age
**********************/;

/*****
CHANGE PATH
*****/;
local in "/data";
local pathtab2 "/data";

/****************
I'm dropping the loans that were matched based on matching criteria=5
*****************/;

use "`in'/doed_loan_raw.dta", clear;

format dt %td;
format per_beg_dt %td;
format per_end_dt %td;
format loan_stat_dt %td;
format curr_mat_dt %td;
format curr_dis_dt %td;
format curr_can_dt %td;
format out_prin_bal_dt %td;


gen loan_type_num=1 if loan_type=="D1";
replace loan_type_num=2 if loan_type=="D2";
replace loan_type_num=3 if loan_type=="D3";
replace loan_type_num=4 if loan_type=="D4";
replace loan_type_num=5 if loan_type=="D5";
replace loan_type_num=6 if loan_type=="D6";
replace loan_type_num=7 if loan_type=="D7";
replace loan_type_num=8 if loan_type=="EU";
replace loan_type_num=9 if loan_type=="FI";
replace loan_type_num=10 if loan_type=="IC";
replace loan_type_num=11 if loan_type=="GB";
replace loan_type_num=12 if loan_type=="NU";
replace loan_type_num=13 if loan_type=="PE";
replace loan_type_num=14 if loan_type=="PK";
replace loan_type_num=15 if loan_type=="PL";
replace loan_type_num=16 if loan_type=="SE";
replace loan_type_num=17 if loan_type=="SF";
replace loan_type_num=18 if loan_type=="SL";
replace loan_type_num=19 if loan_type=="SU";
replace loan_type_num=20 if loan_type=="SG";
replace loan_type_num=21 if loan_type=="CL";
replace loan_type_num=22 if loan_type=="PU";
replace loan_type_num=23 if loan_type=="DU";
replace loan_type_num=24 if loan_type=="RF";

label var loan_type_num "Type of loan (numeric var of loan_type)";
label define loan_type_lab 1 "1=Direct Staf Sub" 2 "2=Direct Staf Unsub" 3 "3=Direct Plus Grad" 4 "4=Direct plus"
	 5 "5=Direct cons Unsub" 6 "6=Direct cons sub" 7 "7=Direct plus cons" 8 "8=Perk Expanded Lending" 9 "9=Federalluy Insured (FISL)"
	 10 "10=ICL" 11 "11=FFEL PLUS Grad" 12 "12=NDSL" 13 "13=Fed Pell Grant" 14 "14=Fed Perkins Loan" 15 "15=FFEL PLUS Loan" 16 "16=FSEOG" 17 "17=FFEL Staf Sub" 18 "18=Supplemental Loan (SLS)"
	 19 "19=FFEL Staf Unsub" 20 "20=Nat'l Sci. & Math Access to retain talent grant" 21 "21=FFEL Cons Loan"  22 "22=Perk Loan"  23 "23=SNat'l Defense Loan"  24 "24=FFEL Refi Loan";
label values loan_type_num loan_type_lab;


gen acad_lvl_num=1 if acad_lvl=="1";
replace acad_lvl_num=2 if acad_lvl=="2";
replace acad_lvl_num=3 if acad_lvl=="3";
replace acad_lvl_num=4 if acad_lvl=="4";
replace acad_lvl_num=5 if acad_lvl=="5";
replace acad_lvl_num=6 if acad_lvl=="A";
replace acad_lvl_num=7 if acad_lvl=="B";
replace acad_lvl_num=8 if acad_lvl=="C";
replace acad_lvl_num=9 if acad_lvl=="D";
replace acad_lvl_num=10 if acad_lvl=="G";
replace acad_lvl_num=11 if acad_lvl=="N";

label var acad_lvl_num "Year of school (numeric var of acad_lvl)";

label define acad_lvl_lab 1 "1=Freshman" 2 "2=Sophomore" 3 "3=Junior" 4 "4=Senior" 5 "5=Fifth year" 6 "6=First year graduate" 7 "7=Second year graduate"
	8 "8=Third year graduate" 9 "9=Beyond third year graduate" 10 "10=Graduate, year of study unknown" 11 "11=Not available";
label values acad_lvl_num acad_lvl_lab;

label define match_criteria 1 "1=SSN, first 3 letters, and DOB" 2 "2=SSN and DOB" 3 "3=SSN and fisrt 3 letters" 4 "4=SSN only" 5 "5=First 3 letters and DOB";
label values match_criteria match_criteria;
label var match_criteria "how the information was matched";


sort tuedt2003 dt loan_type loan_stat_dt;

label var dt "Date the loan was originated";
label var per_beg_dt "Beginning of Period for the loan (indicates beginning of enrollment, I think)";
label var per_end_dt "Ending of Period for the loan (indicates ending of enrollment, I think)";
label var loan_stat_dt "Date reported status became effective";
label var curr_mat_dt "Day when repayment starts";
label var int_rt "Interest rate";
label var int_rt_code "Whether rate is fixed or flexible";
label var loan_grty "Amount agreed to be disbursed";
label var tot_dis "Amount actually disbursed";
label var curr_dis_dt "Date when the amount was disbursed";
label var tot_can "Amount cancelled (not disbursed)";
label var curr_can_dt "date when the loan was cancelled";
label var out_prin_bal "Outstanding pricipal balance by out_prin_bal_dt (incl capitalized interest)";
label var out_prin_bal_dt "Date corresponding to out_prin_bal";
label var loan_seq_no "Number of loan";

  

rename tuedt2003 pid;

/*The cases where match_criteria==5 look very bad. I'm going to drop them*/
drop if match_criteria==5;

sort pid dt loan_seq_no;

/*curr_can_dt is equal to 01jan2001 for all the cases when the student loan was not cancelled. I'll change these cases to missing*/;
replace curr_can_dt=. if tot_can==0;
/*curr_dist_dt is usually equal to 01jan2001 when tot_dis==0. I'll code it as missing, since if the loan was not disbursed, we shouldn't have
a date of disbursement*/;
replace curr_dis_dt=. if tot_dis==0;
/*there are very few cases where tot_dis==0 and out_pric_bal~=0. I'll replace them by .*/;
replace out_prin_bal=0 if tot_dis==0;

/*Consolidated loan*/;
gen cons_loan=(loan_type_num==5 | loan_type_num==6 | loan_type_num==21);
/*Parent Plus loan*/
gen plus_par_loan=(loan_type_num==4 | loan_type_num==15);


/*If the loan is a consolidated loan, per_beg_dt and per_end_dt has no meaning, and they almost always take the value 01jan2001. I'll replace them for missing*/
replace per_beg_dt=. if cons_loan==1;
replace per_end_dt=. if cons_loan==1;

/*I will also replace the opeid by missing if the loan is consolidated, since these loans have an opeid equal to 88888888800*/;
replace opeid="" if cons_loan==1;

drop if cons_loan==1;
drop if tot_dis==0; /*I don't want to include cancelled loans in any way, since
we don't know if the person actually went to school during that period. I hope
that if this is the case, NSC will capture that info*/

preserve;
merge m:1 pid using "`pathtab2'/age_tomerge.dta";
drop _m;

drop if plus_par_loan==1; /*I don't want to sum loans that have been taken by parents*/

drop if ((dt-dob)/365<15); /*Dropping loans that were opened before the person turned 15y.o.*/
drop if ((per_beg_dt-dob)/365<15); /*This drops 63 loans. They all have per_beg_dt=Jan1900 and the loan type is PU. They are clearly wrong*/

sort pid loan_seq_no loan_type per_beg_dt loan_stat_dt;

by pid loan_seq_no loan_type per_beg_dt: keep if _n==1;

keep pid dob dob_y dob_m age_2004 loan_seq_no loan_type per_beg_dt per_end_dt dt tot_dis;

/*Here I create the variable of total federal student loans disbursed by age*/
sort pid loan_seq_no loan_type per_beg_dt;
forvalue yo=23(1)35 {;
gen tot_fedloans_t_`yo'=0;
replace tot_fedloans_t_`yo'=tot_dis if (per_beg_dt-dob)/365<`yo';
by pid: egen tot_fedloans_`yo'=sum(tot_fedloans_t_`yo');
local yo_1=`yo'-1;
rename tot_fedloans_`yo' tot_fedloans_`yo_1';
drop tot_fedloans_t_`yo';
};
keep pid tot_fedloans_2* tot_fedloans_3*;
by pid: keep if _n==1;
save "`pathtab2'/tot_fedloans.dta", replace;

restore;

keep pid per_beg_dt per_end_dt acad_lvl_num opeid;
sort pid per_beg_dt;
save "`pathtab2'/nslds_tomerge_degree.dta", replace;
